In [1]:
import urllib2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn import metrics
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from bs4 import BeautifulSoup
%matplotlib inline
In [2]:
def get_tag_name(tag_url):
"""Takes a Atlas Obscura db dump and turns the tag_url into bare tags."""
return tag_url.split('/')[2]
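# e.g. (hypothetical tag_url format): get_tag_name('/categories/food') returns 'food'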
def get_dummies_and_join(df,var_column):
"""gets dummies for a var_column in a dataframe and returns a dataframe with
those dummies joined on."""
df = df.join(pd.get_dummies(df[var_column]))
return df
def unique_article_set(df,column):
"""Takes a DataFrame and a column to groupby on and returns the DF, using
    sum as the aggfunc for the remaining columns. Typically this will be a
list of articles containing duplicates with an array of dummy vars
attached to it. Column is a string, df is a pandas DataFrame"""
return df.groupby(column).sum()
##Skipped over the Time Series section. Should come later.
def append_atlas_url(prefix,column):
"""Adds a given prefix to items in a Series and returns a list with the
    prefix added to each item. The result is typically assigned back to the
    original DataFrame column."""
return [prefix + x for x in column]
def get_simplereach_url(x):
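    """Strips the 'http://' scheme from a URL string so it matches the article index; returns None for non-string values."""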
if type(x) == str:
return x.split('http://')[1]
else:
pass
def target_pageview_cutoff(pageviews,pv_column):
"""Creates a target vector of categorical values (1,0) based on whether a
particular article is in excess of the PV cutoff."""
return [1 if x > pageviews else 0 for x in pv_column]
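# e.g. (hypothetical values): target_pageview_cutoff(10000, pd.Series([500, 25000])) returns [0, 1]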
def get_year(df,date_column):
"""Returns a Series of just the year based on a date column passed into the
function."""
return pd.DatetimeIndex(df[date_column]).year
def get_total_tagged(df,new_col_name):
"""Takes the articles and tags dataframe, sums the tag vectors, and Creates
a new dataframe containing the tags and the number of articles tagged."""
total_tagged = pd.DataFrame(data=df.sum(),columns=[new_col_name])
total_tagged = total_tagged.sort_values('num_tagged',ascending=False).drop(
'pageviews',
axis=0,
)
return total_tagged
def get_interaction_features(df,interaction_tags,total_tagged,tag_threshold):
    """Takes the articles and tags DataFrame and creates binary interaction
    features by pairing each tag in interaction_tags with the rest of the set.
    The DataFrame must only contain the index and categorical features.
    Interaction features are only created for tags whose article counts exceed
    tag_threshold."""
    interactions = pd.DataFrame(index=df.index)
    frequent_tags = df.drop(total_tagged[total_tagged.num_tagged < tag_threshold].index,axis=1).columns
    for item in interaction_tags:
        for column in frequent_tags:
            interactions[item + '_' + column] = df[item] + df[column]
    return interactions
def correct_values(interactions_df):
"""Changes any values of 2 to 1 and 1s to 0s. This is because the dataframe
of interactions is created in get_interaction_features() by adding two columns
together to generate a composite column that represents an interaction
    feature. Where there's a 2, the article has both tags, so the interaction
    should be 1. Where there's a 1, the article has only one of the tags, so
    the interaction should be 0."""
def two_to_one(x):
if x == 2.0:
return 1
else:
return 0
    for x in interactions_df.columns:
        interactions_df[x] = interactions_df[x].apply(two_to_one)
return interactions_df
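# e.g. an interaction column [2.0, 1.0, 0.0] (both tags / one tag / neither) becomes [1, 0, 0]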
def drop_identity_tags(interactions_df):
"""Takes the interaction dataframe and drops the tags where the interaction_tags
paired to one another."""
tagged_total = get_total_tagged(interactions_df,'num_tagged')
interactions_df = interactions_df.drop(tagged_total[0:26].index,axis=1)
    return interactions_df
def drop_zero_cols(df):
"""Takes a dataframe and drops any column that sums to zero. This is used to
drop categorical variables that not a single datapoint are positive for. cols
should only be categorical variables encoded (0,1)."""
df = df.fillna(0)
for item in df.columns:
if df[item].sum() == 0:
df = df.drop(item,axis=1)
else:
continue
return df
def drop_tag_features(df,tag_threshold):
"""Takes a dataframe of articles and tag features and returns a subset of df
that contains only the tags that have corresponding article counts above the
tag_threshold."""
total_tagged = get_total_tagged(df,'num_tagged')
df2 = df.drop(total_tagged[total_tagged.num_tagged < tag_threshold].index,axis=1)
return df2
def get_cross_validation_score(X,y,estimator,n_folds):
"""Dependent on sklearn cross_validation and numpy. Takes an X and y and returns the cross_validation accuracy score for a given linear_model.estimator."""
kf = cross_validation.KFold(len(X),n_folds=n_folds)
scores = []
for train_index, test_index in kf:
lr = estimator.fit(X.iloc[train_index],y.iloc[train_index])
scores.append(lr.score(X.iloc[test_index],y.iloc[test_index]))
return np.mean(scores)
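# Note: roughly equivalent to cross_validation.cross_val_score(estimator,X,y,cv=n_folds).mean(), kept explicit here with an unshuffled KFold.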
def get_probabilities(model,X):
"""Takes an estimator model that has been fix and the X used in that model
and returns the probabilities from predict_proba with the original feature
names zipped back up with them."""
interactions_model_scores = model.predict_proba(X)[:,1]
interactions_probabilities = pd.DataFrame(zip(X.columns,interactions_model_scores),columns=['tags','probabilities'])
interactions_probabilities = interactions_probabilities.sort_values('probabilities',ascending=False)
return interactions_probabilities
def get_sub_tags(df):
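    """Resets the index and splits interaction tag names of the form 'tag_subtag', adding the part after the underscore as a new 'subtag' column."""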
df = df.reset_index()
df['subtag'] = df.tags.apply(lambda x: x.split('_')[1])
return df
def get_pageviews(probabilities,df):
"""Sums the pageviews of all articles corresponding to a tag and returns the probabilities dataframe with a new column of total PVs for each tag."""
probabilities['pageview'] = [sum(df['pageviews'][df[item] == 1]) for item in probabilities['tags']]
return probabilities
def get_roc_scores(y,scores):
return metrics.roc_auc_score(y,scores)
def get_article_tags(url):
"""Uses BeautifulSoup to scrape Atlas pages for our tags"""
page = urllib2.urlopen(url).read()
soup = BeautifulSoup(page, 'lxml')
if "/articles/" in url:
taglist = [elem.get_text() for elem in soup.select('div.item-tags a')]
elif "/places" in url:
taglist = [elem.get_text() for elem in soup.select('div.item-tags a')]
else:
taglist = ""
for count,item in enumerate(taglist):
taglist[count] = item.split('#')[1]
return url, taglist
def transform_article_for_prediction(url,whole_set):
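    """Scrapes the tags for a single article url and returns a one-row DataFrame of tag dummies reindexed to the columns of whole_set, with missing tags filled as 0."""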
url, taglist = get_article_tags(url)
tags = {}
for item in taglist:
tags[item] = [1]
article_tag_vectors = pd.DataFrame(tags)
article_tag_vectors = article_tag_vectors.reindex(columns=whole_set.columns,
fill_value=0)
return article_tag_vectors
In [3]:
df = pd.read_csv('atlas-taggings.csv')
In [4]:
df[2:5]
Out[4]:
In [5]:
articles = df[df.tagged_type == 'Article']
In [6]:
articles.tag_url = articles.tag_url.apply(get_tag_name)
In [7]:
articles = get_dummies_and_join(articles,'tag_url')
articles = articles.drop(['tag_id','tag_url','tagged_type','tagged_id'],axis=1)
In [8]:
articles = unique_article_set(articles,'tagged_url')
In [9]:
articles = articles.reset_index().set_index('tagged_url')
In [10]:
pageviews = pd.read_csv('output_articles_performance.csv',header=None,names=[
'url','published','pageviews'
])
In [11]:
pageviews.url = append_atlas_url('www.atlasobscura.com/articles/',pageviews.url)
In [12]:
pageviews.describe()
Out[12]:
In [13]:
pageviews.set_index('url',inplace=True)
article_set = articles.join(pageviews)
In [14]:
article_set['ten_thousand'] = target_pageview_cutoff(10000,article_set.pageviews)
In [15]:
article_set['published'] = pd.to_datetime(article_set['published'])
article_set['year'] = get_year(article_set,'published')
In [54]:
article_set.pageviews.plot(kind='density',title='Page View Distribution, All Articles')
In [17]:
ax = article_set.boxplot(column='pageviews',by='year',figsize=(6,6),showfliers=False)
ax.set(title='PV distribution by year of publication, no outliers',ylabel='pageviews')
Out[17]:
In [18]:
sns.factorplot(
x='year',
y='ten_thousand',
data = article_set
)
Out[18]:
In [19]:
total_tagged = get_total_tagged(article_set,'num_tagged')
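total_tagged is computed here, but the interaction-feature helpers defined above (get_interaction_features, correct_values, drop_identity_tags) are never exercised in this notebook. A minimal sketch of how they are intended to chain together; the tag names ('art', 'food') and the threshold of 30 are illustrative assumptions, not values taken from the data:
In [ ]:
# Sketch only: 'art' and 'food' are hypothetical tag columns, 30 is an arbitrary cutoff.
# tag_features = article_set.drop(['pageviews','published','ten_thousand','year'],axis=1)
# interactions = get_interaction_features(tag_features,['art','food'],total_tagged,tag_threshold=30)
# interactions = correct_values(interactions)       # 2 -> 1 (article has both tags), 1 -> 0
# interactions = drop_identity_tags(interactions)   # drop tags paired with themselves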
In [20]:
article_set.fillna(value=0,inplace=True)
In [21]:
y = article_set.ten_thousand
X = article_set.drop(['pageviews','published','ten_thousand'],axis=1)
In [22]:
cross_val_score = get_cross_validation_score(X,y,linear_model.LogisticRegression(penalty = 'l1'),
n_folds=5)
In [23]:
lr = linear_model.LogisticRegression(penalty = 'l1').fit(X,y)
In [24]:
lr_scores = lr.predict_proba(X)[:,1]
roc_score = get_roc_scores(y,lr_scores)
print roc_score
In [25]:
single_tag_probabilities = get_probabilities(lr,X)
In [26]:
params = {'n_neighbors' : [x for x in range(2,100,4)],
'weights' : ['distance','uniform']}
gs = GridSearchCV(estimator = KNeighborsClassifier(),param_grid=params,
n_jobs=-1,cv=10,verbose=1)
gs.fit(X,y)
print gs.best_params_
print gs.best_score_
knn = gs.best_estimator_.fit(X,y)
knn_probs = get_probabilities(knn,X)
In [27]:
knn_cross_val_score = get_cross_validation_score(X,y,knn,5)
knn_scores = knn.predict_proba(X)[:,1]
knn_roc_score = get_roc_scores(y,knn_scores)
In [55]:
params_rfc = {'max_depth': np.arange(20,100,5),
'min_samples_leaf': np.arange(90,200,5),
'n_estimators': [20],
'criterion' : ['gini','entropy']
}
gs1 = GridSearchCV(RandomForestClassifier(),param_grid=params_rfc, cv=10, scoring='roc_auc',n_jobs=-1,verbose=1)
gs1.fit(X,y)
print gs1.best_params_
print gs1.best_score_
In [29]:
rf = gs1.best_estimator_
rf.fit(X,y)
rf_cross_val_score = get_cross_validation_score(X,y,rf,5)
rf_scores = rf.predict_proba(X)[:,1]
rf_roc_score = get_roc_scores(y,rf_scores)
In [30]:
print "Logistic Regression Cross-validation Score: ", cross_val_score
print "K Nearest Neighbors Cross-validation Score: ", knn_cross_val_score
print "RandomForest Cross-validation Score: ", rf_cross_val_score
print "Logistic Regressions ROC AUC score: ", roc_score
print "K Nearest Neighbors ROC AUC score: ", knn_roc_score
print "RandomForest ROC AUC score: ", rf_roc_score
In [31]:
url, taglist = get_article_tags('http://www.atlasobscura.com/articles/the-ao-exit-interview-12-years-in-the-blue-man-group')
In [32]:
transformed_article = transform_article_for_prediction(url,article_set)
In [33]:
article_set.head(1)
Out[33]:
In [34]:
y1 = article_set[article_set.year >= 2016].ten_thousand
X1 = article_set[article_set.year >= 2016].drop(['pageviews','published','ten_thousand'],axis=1)
In [35]:
cross_val_score1 = get_cross_validation_score(X1,y1,linear_model.LogisticRegression(penalty = 'l1'),
n_folds=5)
lr1 = linear_model.LogisticRegression(penalty = 'l1').fit(X1,y1)
lr_scores1 = lr1.predict_proba(X1)[:,1]
roc_score1 = get_roc_scores(y1,lr_scores1)
print roc_score1
In [36]:
simplereach = pd.read_csv('~/Downloads/all-content-simplereach.csv')
In [37]:
simplereach.Url = simplereach.Url.apply(get_simplereach_url)
simplereach = simplereach.set_index('Url')
simplereach = simplereach[['Avg Engaged Time','Social Actions','Facebook Shares','FaceBook Referrals']]
In [38]:
article_set2 = article_set.join(simplereach['Facebook Shares'])
In [39]:
article_set2['five_hundred_shares'] = target_pageview_cutoff(500,article_set2['Facebook Shares'])
In [48]:
y2 = article_set2.five_hundred_shares
X2 = article_set2.drop(['pageviews',
'published',
'ten_thousand',
'Facebook Shares',
'five_hundred_shares'
],axis=1)
cross_val_score_social = get_cross_validation_score(X2,y2,linear_model.LogisticRegression(penalty = 'l1'),
n_folds=5)
lr_social = linear_model.LogisticRegression(penalty = 'l1').fit(X2,y2)
lr_scores_social = lr_social.predict_proba(X2)[:,1]
roc_score_social = get_roc_scores(y2,lr_scores_social)
print "Cross-val score when predicting Facebook shares > 500: ", cross_val_score_social
print "ROC AUC score when predicting Facebook shares > 500: ",roc_score_social
In [41]:
url = 'http://www.atlasobscura.com/articles/winters-effigies-the-deviant-history-of-the-snowman'
lr_social.predict(transform_article_for_prediction(url,X2))
Out[41]:
In [50]:
params_social = {'n_neighbors' : [x for x in range(2,100,4)],
'weights' : ['distance','uniform']}
gs_social = GridSearchCV(estimator = KNeighborsClassifier(),param_grid=params_social,
n_jobs=-1,cv=10,verbose=1)
gs_social.fit(X2,y2)
print gs_social.best_params_
print gs_social.best_score_
knn_social = gs_social.best_estimator_.fit(X2,y2)
knn_probs_social = get_probabilities(knn_social,X2)
knn_cross_val_score_social = get_cross_validation_score(X2,y2,knn_social,5)
knn_scores_social = knn_social.predict_proba(X2)[:,1]
knn_roc_score_social = get_roc_scores(y2,knn_scores_social)
In [43]:
params_rfc = {'max_depth': np.arange(20,100,5),
'min_samples_leaf': np.arange(90,200,5),
'n_estimators': [20]}
gs1_social = GridSearchCV(RandomForestClassifier(),param_grid=params_rfc, cv=10, scoring='roc_auc',n_jobs=-1,verbose=1)
gs1_social.fit(X2,y2)
rf_social = gs1_social.best_estimator_
rf_social.fit(X2,y2)
rf_cross_val_score_social = get_cross_validation_score(X2,y2,rf_social,5)
rf_scores_social = rf_social.predict_proba(X2)[:,1]
rf_roc_score_social = get_roc_scores(y2,rf_scores_social)
In [52]:
print gs1_social.best_params_
print gs1_social.best_score_
In [53]:
print "Logistic Regression Cross-validation Score: ", cross_val_score_social
print "K Nearest Neighbors Cross-validation Score: ", knn_cross_val_score_social
print "RandomForest Cross-validation Score: ", rf_cross_val_score_social
print "Logistic Regressions ROC AUC score: ", roc_score_social
print "K Nearest Neighbors ROC AUC score: ", knn_roc_score_social
print "RandomForest ROC AUC score: ", rf_roc_score_social
In [45]:
np.mean(y)
Out[45]:
In [46]:
simplereach.describe()
Out[46]: